######################################################################################################
# Generic functions
# Authors: Kaspar Wuthrich and Ying Zhu
# DISCLAIMER: This software is provided "as is" without warranty of any kind, expressed or implied. 
# Questions/error reports: kwuthrich@ucsd.edu
######################################################################################################

### Generate data with pre-specified R2

gen_design <- function(n,p,k,RsqY,RsqD,alpha0,dgp="homosc"){
  
  tbeta <- tgamma <- rep(0,p)
  tbeta[1:k] <- tgamma[1:k] <- 1
  
  # Homoscedastic dgp; allows alpha0!=0
  if(dgp=="homosc"){
    
    cD <- sqrt(RsqD/(k-RsqD*k))
    
    if (alpha0==0){
      cY <- sqrt(RsqY/(k-RsqY*k))
    }
    if (alpha0!=0){
      a   <- k*(RsqY-1)
      b   <- 2*alpha0*cD*k*(RsqY-1)
      c   <- alpha0^2*cD^2*k*(RsqY-1)+(alpha0^2+1)*RsqY
      cY  <- max(Re(polyroot(c(c,b,a))))
    }

    gamma <-  cD*tgamma
    beta  <-  cY*tbeta
    
    X <- matrix(rnorm(n*p),n,p)
    d <- X%*%gamma + rnorm(n)
    y <- alpha0*d + X%*%beta + rnorm(n)
    
  }
  
  # Heteroscedastic dgp; designed for alpha0=0
  if(dgp=="heterosc"){
    
    cD <- sqrt(RsqD/(k-RsqD*k))
    cY <- sqrt(RsqY/(k-RsqY*k))
    
    gamma <-  cD*tgamma
    beta  <-  cY*tbeta
    
    X     <- matrix(rnorm(n*p),n,p)
    sig_d <- sqrt((1+X%*%gamma)^2/mean((1+X%*%gamma)^2))
    sig_y <- sqrt((1+X%*%beta)^2/mean((1+X%*%beta)^2))
    d     <- X%*%gamma + sig_d*rnorm(n)
    y     <- X%*%beta + sig_y*rnorm(n)
    
  }
  
  # Bernoulli regressors; designed for alpha0=0
  if(dgp=="bern"){
    
    cD <- sqrt(RsqD/(k*0.25*(1-RsqD)))
    cY <- sqrt(RsqY/(k*0.25*(1-RsqY)))
    
    gamma <-  cD*tgamma
    beta  <-  cY*tbeta
    
    X <- matrix(rbinom(n*p,1,0.5),n,p)
    d <- X%*%gamma + rnorm(n)
    y <- X%*%beta + rnorm(n)
    
  }
  
  # t-distributed errors; designed for alpha0=0
  if(dgp=="tdistr"){
    
    cD <- sqrt(RsqD/(k-RsqD*k))
    cY <- sqrt(RsqY/(k-RsqY*k))
    
    gamma <-  cD*tgamma
    beta  <-  cY*tbeta
    
    X <- matrix(rnorm(n*p),n,p)
    d <- X%*%gamma + rt(n,df=5)/sqrt(5/3)
    y <- X%*%beta + rt(n,df=5)/sqrt(5/3)
    
  }
  
  return(list(X=X,d=d,y=y))
  
}

### Post double Lasso with CV regularization choice

pdl_cv <- function(y,d,X,cv){
  
  cvfit1  <- cv.glmnet(X,d,nfolds=5)
  cvfit2  <- cv.glmnet(X,y,nfolds=5)
  
  if (cv=="min"){
    gamma   <- as.matrix(coef(cvfit1, s = "lambda.min")[-1])
    pi      <- as.matrix(coef(cvfit2, s = "lambda.min")[-1])
  }
  if (cv=="1se"){
    gamma   <- as.matrix(coef(cvfit1, s = "lambda.1se")[-1])
    pi      <- as.matrix(coef(cvfit2, s = "lambda.1se")[-1])
  }
  
  ind <- (abs(gamma) + abs(pi) > 0)
  
  if (sum(ind)==0) {
    obj <- lm(y ~ d)
  }
  if (sum(ind)>0){
    Xsel  <- X[,ind]
    obj   <- lm(y ~ d + Xsel)
  } 
  
  vcov      <- vcovHC(obj, type = "HC1")
  robust_se <- sqrt(diag(vcov))
  
  alpha_hat <- obj$coefficients[2]
  se_hat    <- robust_se[2]
  
  return(list(alpha_hat=alpha_hat,se_hat=se_hat,sel_index=ind))
  
}

### OLS with HCK standard errors (Cattaneo et al. (2018);
# Code is partly based on the replication material available here: https://github.com/mdcattaneo/replication-CJN_2018_JASA

reg_hck <- function(y,d,X){
  
  n <- length(y)
  
  Xc    <- cbind(1,X)
  dXc   <- cbind(1,d,X)
  
  b_d   <- chol2inv(chol(crossprod(Xc)))%*%crossprod(Xc,d)
  res_d <- d - Xc %*% b_d
  b_y   <- chol2inv(chol(crossprod(dXc)))%*%crossprod(dXc,y)
  res_y <- y - dXc %*% b_y
  
  M     <- diag(rep(n,1))-Xc%*%chol2inv(chol(crossprod(Xc)))%*%t(Xc)
  Sig   <- (1/n)*t(res_d^2) %*% (chol2inv(chol(M*M)))%*%res_y^2
  Gam   <- mean(res_d^2)
  
  alpha_hat <- b_y[2]
  se_hat    <- sqrt((1/Gam^2)*(Sig/n))
  
  return(c(alpha_hat,se_hat))
  
}

### OLS with robust standard errors

reg_robust <- function(y,d,X,constonly=0){
  
  if (constonly==0) {ols <- lm(y ~ d + X)}
  if (constonly==1) {ols <- lm(y ~ d)}
  
  alpha_hat   <- ols$coef[2]
  vcov        <- vcovHC(ols, type = "HC1")
  robust_se   <- sqrt(diag(vcov))
  se_hat      <- robust_se[2]
  
  return(list(alpha_hat=alpha_hat,se_hat=se_hat))
}

### Debiased Lasso (van der Geer et al. (2014)); code designed for homoscedastic errors
# See Caner & Kock (2018) for an extension and code for heteroscedastic errors

db_cv <- function(y,d,X,cv){
  
  n <- length(y)
  
  y <- y - mean(y)
  d <- d - mean(d)
  X <- scale(X, scale = FALSE)
  
  cvfit1  <- cv.glmnet(X,d,nfolds=5,intercept=FALSE)
  cvfit2  <- cv.glmnet(cbind(d,X),y,nfolds=5,intercept=FALSE)
  
  if (cv=="min"){
    gamma   <- as.matrix(coef(cvfit1, s = "lambda.min")[-1])
    beta    <- as.matrix(coef(cvfit2, s = "lambda.min")[-1])
  }
  if (cv=="1se"){
    gamma   <- as.matrix(coef(cvfit1, s = "lambda.1se")[-1])
    beta    <- as.matrix(coef(cvfit2, s = "lambda.1se")[-1])
  }
  
  tau2      <- (t(d - X %*% gamma) %*% d)/n
  Theta     <- c(1/tau2)*c(1,-gamma)
  bias      <- (Theta %*% t(cbind(d,X)) %*% (y-cbind(d,X) %*% beta))/n
  alpha_hat <- beta[1] + bias
  u_hat     <- c(y - cbind(d,X)%*%beta)
  se_hat    <- sqrt(mean(u_hat^2))*sqrt((Theta %*% ((t(cbind(d,X)) %*%  cbind(d,X))/n) %*% Theta)/n)

  return(list(alpha_hat=alpha_hat,se_hat=se_hat))
  
}

